Pandas - Python Data Analysis

Upgrade PIP

Windows

python -m pip install -U pip

Linux

pip install -U pip

Install Packages if required


In [67]:
!pip install wheel


Requirement already satisfied (use --upgrade to upgrade): wheel in c:\anaconda\lib\site-packages

Importing required libraries


In [221]:
%matplotlib inline
from pandas import *
from pylab import *
import pandas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import HTML
from __future__ import division

def side_by_side(*objs, **kwds):
    from pandas.formats.printing import adjoin
    space = kwds.get('space', 4)
    reprs = [repr(obj).split('\n') for obj in objs]
    print adjoin(space, *reprs)
    
def htmldisp(x):
    try:
        return HTML(pd.DataFrame(x).to_html())
    except:
        print ("Alert!! \nInput should be a Pandas obj.")

plt.rc('figure', figsize=(10, 6))
# pandas.set_printoptions(notebook_repr_html=False)

In [69]:
plt.plot([1,2,4,8,16])


Out[69]:
[<matplotlib.lines.Line2D at 0x8c06fd0>]

Series


In [70]:
np.random.randn(5)


Out[70]:
array([-0.66537514, -0.5677748 , -0.53418025,  0.51045839,  0.11856411])

In [71]:
labels = ['a', 'b', 'c', 'd', 'e']
s = pd.Series(np.random.randn(5), index=labels)

In [72]:
'b' in s


Out[72]:
True

In [73]:
s['b']


Out[73]:
1.468315359355616

In [74]:
s.index


Out[74]:
Index([u'a', u'b', u'c', u'd', u'e'], dtype='object')

In [75]:
s


Out[75]:
a   -0.502969
b    1.468315
c   -0.586066
d   -0.037522
e    0.229438
dtype: float64

In [76]:
mapping = s.to_dict()
mapping


Out[76]:
{'a': -0.50296937834505506,
 'b': 1.468315359355616,
 'c': -0.58606560139888719,
 'd': -0.037522292320861968,
 'e': 0.229438447928906}

In [77]:
s = pd.Series(mapping, index=['b', 'e', 'a', 'd', 'f'])
s


Out[77]:
b    1.468315
e    0.229438
a   -0.502969
d   -0.037522
f         NaN
dtype: float64

In [78]:
notnull(s)


Out[78]:
b     True
e     True
a     True
d     True
f    False
dtype: bool

In [79]:
s[notnull(s)]


Out[79]:
b    1.468315
e    0.229438
a   -0.502969
d   -0.037522
dtype: float64

In [80]:
s.dropna()


Out[80]:
b    1.468315
e    0.229438
a   -0.502969
d   -0.037522
dtype: float64

In [81]:
s * 2


Out[81]:
b    2.936631
e    0.458877
a   -1.005939
d   -0.075045
f         NaN
dtype: float64

In [82]:
s[3:]


Out[82]:
d   -0.037522
f         NaN
dtype: float64

In [83]:
s[:3]


Out[83]:
b    1.468315
e    0.229438
a   -0.502969
dtype: float64

In [84]:
s.index


Out[84]:
Index([u'b', u'e', u'a', u'd', u'f'], dtype='object')

DataFrame: 2D collection of Series


In [85]:
df = DataFrame({'a': np.random.randn(6),
                'b': ['foo', 'bar'] * 3,
                'c': np.random.randn(6)})
df['d'] = range(6)
df


Out[85]:
a b c d
0 -0.704317 foo -1.782846 0
1 -0.768235 bar -0.916710 1
2 0.335652 foo 0.732409 2
3 -1.040598 bar -0.042354 3
4 0.021351 foo -0.096634 4
5 -0.873731 bar 1.802349 5

In [86]:
df['b']    # by column


Out[86]:
0    foo
1    bar
2    foo
3    bar
4    foo
5    bar
Name: b, dtype: object

In [87]:
df[:3]


Out[87]:
a b c d
0 -0.704317 foo -1.782846 0
1 -0.768235 bar -0.916710 1
2 0.335652 foo 0.732409 2

In [88]:
df[:-2]


Out[88]:
a b c d
0 -0.704317 foo -1.782846 0
1 -0.768235 bar -0.916710 1
2 0.335652 foo 0.732409 2
3 -1.040598 bar -0.042354 3

In [89]:
df[-2:]   # the last 2 rows


Out[89]:
a b c d
4 0.021351 foo -0.096634 4
5 -0.873731 bar 1.802349 5

In [90]:
df[['a','b']]


Out[90]:
a b
0 -0.704317 foo
1 -0.768235 bar
2 0.335652 foo
3 -1.040598 bar
4 0.021351 foo
5 -0.873731 bar

In [91]:
np.tile(['foo', 'bar'], 3)   # in numpy


Out[91]:
array(['foo', 'bar', 'foo', 'bar', 'foo', 'bar'], 
      dtype='|S3')

In [92]:
df.xs(0)   # returning the first row


Out[92]:
a   -0.704317
b         foo
c    -1.78285
d           0
Name: 0, dtype: object

In [93]:
df.ix[2]    # does equal thing as above. In this case, 3rd row.


Out[93]:
a    0.335652
b         foo
c    0.732409
d           2
Name: 2, dtype: object

In [94]:
df.ix[2, 'b']


Out[94]:
'foo'

In [95]:
timeit df.ix[1]


10000 loops, best of 3: 112 µs per loop

In [96]:
df.get_value(2, 'b')


Out[96]:
'foo'

In [97]:
timeit df.ix[2, 'b']


The slowest run took 8.82 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 4.97 µs per loop

In [98]:
timeit df.get_value(2, 'b')


The slowest run took 11.29 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 2.41 µs per loop

In [99]:
df.ix[2:4, 'b']


Out[99]:
2    foo
3    bar
4    foo
Name: b, dtype: object

In [100]:
df.ix[2:4, ['b', 'c']]


Out[100]:
b c
2 foo 0.732409
3 bar -0.042354
4 foo -0.096634

In [101]:
df.ix[2:4, 'b':'c']   # slices by column - essentially the same as above.


Out[101]:
b c
2 foo 0.732409
3 bar -0.042354
4 foo -0.096634

In [102]:
df.ix[2:4, 0:2]


Out[102]:
a b
2 0.335652 foo
3 -1.040598 bar
4 0.021351 foo

In [103]:
df.ix[[0, 2, 4], ['b', 'c', 'd']]   # pass a list of rows and columns I want to select out


Out[103]:
b c d
0 foo -1.782846 0
2 foo 0.732409 2
4 foo -0.096634 4

In [104]:
df['c'] > 0


Out[104]:
0    False
1    False
2     True
3    False
4    False
5     True
Name: c, dtype: bool

In [105]:
df.ix[df['c'] > 0]   # boolean arrays


Out[105]:
a b c d
2 0.335652 foo 0.732409 2
5 -0.873731 bar 1.802349 5

In [106]:
df.index


Out[106]:
RangeIndex(start=0, stop=6, step=1)

In [107]:
df.columns


Out[107]:
Index([u'a', u'b', u'c', u'd'], dtype='object')

In [108]:
pandas.date_range('1/1/2000',periods=6)


Out[108]:
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06'],
              dtype='datetime64[ns]', freq='D')

In [109]:
df = DataFrame({'a': np.random.randn(6),
                'b': ['foo', 'bar'] * 3,
                'c': np.random.randn(6)},
               index=pandas.date_range('1/1/2000', periods=6))
df


Out[109]:
a b c
2000-01-01 1.269364 foo 0.825877
2000-01-02 1.037640 bar -1.477966
2000-01-03 0.621788 foo -1.342809
2000-01-04 1.180793 bar -0.799461
2000-01-05 0.824102 foo -0.887313
2000-01-06 0.047541 bar 1.362241

In [110]:
df = DataFrame({'a': np.random.randn(6),
                'b': ['foo', 'bar'] * 3,
                'c': np.random.randn(6)},
               columns=['a', 'b', 'c', 'd'])
df


Out[110]:
a b c d
0 -1.283341 foo -0.172198 NaN
1 -1.511962 bar 0.357274 NaN
2 0.327979 foo 0.308704 NaN
3 0.825253 bar -2.168154 NaN
4 -0.347085 foo 0.032781 NaN
5 -0.051206 bar 1.292636 NaN

In [111]:
isnull(df)


Out[111]:
a b c d
0 False False False True
1 False False False True
2 False False False True
3 False False False True
4 False False False True
5 False False False True

Creation from nested dicts

These arise naturally in Python code


In [112]:
data = {}
for col in ['foo', 'bar', 'baz']:
    for row in ['a', 'b', 'c', 'd']:
        data.setdefault(col, {})[row] = np.random.randn()   # sorted cols: bar, baz, foo
data


Out[112]:
{'bar': {'a': 0.38112931907002495,
  'b': -0.765909219644571,
  'c': 0.4985234895811544,
  'd': -0.7330625057267428},
 'baz': {'a': 1.7271692781624934,
  'b': 0.7570402996083134,
  'c': 1.2739907718089785,
  'd': -0.20450433532235054},
 'foo': {'a': 0.1980977136126971,
  'b': 1.211788252583382,
  'c': -1.0048549547865768,
  'd': -1.1082866658145967}}

In [113]:
del data ['foo']['c']   # delete an entry by index key (col) and row.

In [114]:
DataFrame(data)   # pass a dict


Out[114]:
bar baz foo
a 0.381129 1.727169 0.198098
b -0.765909 0.757040 1.211788
c 0.498523 1.273991 NaN
d -0.733063 -0.204504 -1.108287

Data alignment


In [115]:
close_px = read_csv('stock_data.csv', index_col=0, parse_dates=True)

In [172]:
!head -n 10 stock_data.csv


'head' is not recognized as an internal or external command,
operable program or batch file.

In [117]:
!head  stock_data.csv


'head' is not recognized as an internal or external command,
operable program or batch file.

In [118]:
close_px


Out[118]:
AA AAPL GE IBM JNJ MSFT PEP SPX XOM
2007-10-29 37.41 185.09 34.46 106.78 57.13 31.78 65.67 1540.98 85.51
2007-10-30 36.43 187.00 34.39 106.15 56.99 32.70 65.80 1531.02 83.25
2007-10-31 36.79 189.95 34.97 108.01 57.30 33.84 65.69 1549.38 84.03
2007-11-01 35.22 187.44 34.27 105.72 56.85 34.07 64.51 1508.44 80.84
2007-11-02 35.83 187.87 34.27 106.59 56.95 34.07 65.03 1509.65 80.32
2007-11-05 35.22 186.18 34.15 105.48 56.70 33.77 64.84 1502.17 80.07
2007-11-06 35.84 191.79 34.14 105.27 56.80 33.47 65.49 1520.27 82.56
2007-11-07 34.76 186.30 33.20 103.69 56.19 32.65 64.46 1475.62 79.96
2007-11-08 35.00 175.47 33.15 99.05 56.79 31.94 65.50 1474.77 82.00
2007-11-09 34.47 165.37 32.61 93.58 57.29 31.01 65.48 1453.70 79.64
2007-11-12 33.33 153.76 32.50 94.70 58.30 30.69 64.67 1439.18 77.52
2007-11-13 34.04 169.96 33.31 98.27 59.18 31.79 65.35 1481.05 79.67
2007-11-14 34.25 166.11 33.14 96.56 58.81 31.30 65.99 1470.58 79.15
2007-11-15 33.76 164.30 32.55 96.71 58.80 31.14 65.57 1451.15 77.48
2007-11-16 33.74 166.39 32.84 97.82 59.56 31.44 66.08 1458.74 78.04
2007-11-19 32.59 163.95 32.42 95.42 59.42 31.32 66.90 1433.27 77.13
2007-11-20 32.88 168.85 32.32 96.54 59.47 31.90 66.69 1439.70 80.53
2007-11-21 32.68 168.46 31.58 95.42 59.03 31.57 66.87 1416.77 79.82
2007-11-23 32.66 171.54 32.01 97.13 59.16 31.46 67.29 1440.70 80.96
2007-11-26 32.41 172.54 31.21 95.19 59.60 30.41 66.56 1407.22 78.57
2007-11-27 32.56 174.81 31.82 96.92 59.92 30.49 67.65 1428.23 79.21
2007-11-28 33.65 180.22 32.68 100.23 60.46 31.08 68.76 1469.02 80.62
2007-11-29 33.97 184.29 32.40 100.35 60.51 30.98 68.20 1469.72 81.24
2007-11-30 33.80 182.22 32.53 98.18 59.93 30.99 68.78 1481.14 81.76
2007-12-03 32.98 178.86 31.38 98.79 59.90 30.36 68.24 1472.42 81.48
2007-12-04 32.76 179.81 30.80 99.54 60.10 30.23 68.30 1462.79 80.81
2007-12-05 33.09 185.50 31.19 100.97 60.34 31.50 68.30 1485.01 82.46
2007-12-06 33.32 189.95 31.66 102.40 60.42 31.87 68.98 1507.34 83.85
2007-12-07 34.30 194.30 31.63 101.62 59.87 31.85 68.95 1504.66 83.91
2007-12-10 35.17 194.21 31.78 102.11 59.97 32.06 69.20 1515.96 84.39
... ... ... ... ... ... ... ... ... ...
2011-09-02 12.04 374.05 15.61 166.98 64.07 25.80 63.30 1173.97 72.14
2011-09-06 11.77 379.74 15.11 165.11 64.64 25.51 62.45 1165.24 71.15
2011-09-07 12.25 383.93 15.65 167.31 65.43 26.00 61.62 1198.62 73.65
2011-09-08 12.03 384.14 15.44 165.25 64.95 26.22 61.34 1185.90 72.82
2011-09-09 11.58 377.48 14.95 161.37 63.64 25.74 59.99 1154.23 71.01
2011-09-12 11.55 379.94 14.87 162.42 63.59 25.89 60.14 1162.27 71.84
2011-09-13 11.63 384.62 15.26 163.43 63.61 26.04 60.54 1172.87 71.65
2011-09-14 11.73 389.30 15.64 167.24 63.73 26.50 61.58 1188.68 72.64
2011-09-15 11.98 392.96 16.08 170.09 64.40 26.99 63.22 1209.11 74.01
2011-09-16 11.97 400.50 16.33 172.99 64.59 27.12 62.05 1216.01 74.55
2011-09-19 11.58 411.63 16.18 173.13 64.14 27.21 60.56 1204.09 73.70
2011-09-20 11.25 413.45 16.04 174.72 64.22 26.98 60.39 1202.09 74.01
2011-09-21 10.84 412.14 15.38 173.02 63.13 25.99 60.79 1166.76 71.97
2011-09-22 10.11 401.82 15.04 168.62 61.92 25.06 60.92 1129.56 69.24
2011-09-23 10.07 404.30 15.21 169.34 61.59 25.06 60.34 1136.43 69.31
2011-09-26 10.45 403.17 15.57 174.51 62.69 25.44 61.89 1162.95 71.72
2011-09-27 10.48 399.26 15.76 177.71 63.82 25.67 62.43 1175.38 72.91
2011-09-28 9.97 397.01 15.45 177.55 63.25 25.58 61.97 1151.06 72.07
2011-09-29 10.06 390.57 15.86 179.17 63.90 25.45 62.58 1160.40 73.88
2011-09-30 9.57 381.32 15.22 174.87 63.69 24.89 61.90 1131.42 72.63
2011-10-03 8.90 374.60 14.69 173.29 62.08 24.53 60.29 1099.23 71.15
2011-10-04 9.12 372.50 14.86 174.74 62.17 25.34 60.45 1123.95 72.83
2011-10-05 9.37 378.25 15.27 176.85 62.35 25.89 60.29 1144.03 73.95
2011-10-06 9.88 377.37 15.53 181.69 62.81 26.34 60.57 1164.97 73.89
2011-10-07 9.71 369.80 15.50 182.39 63.13 26.25 61.02 1155.46 73.56
2011-10-10 10.09 388.81 16.14 186.62 64.43 26.94 61.87 1194.89 76.28
2011-10-11 10.30 400.29 16.14 185.00 63.96 27.00 60.95 1195.54 76.27
2011-10-12 10.05 402.19 16.40 186.12 64.33 26.96 62.70 1207.25 77.16
2011-10-13 10.10 408.43 16.22 186.82 64.23 27.18 62.36 1203.66 76.37
2011-10-14 10.26 422.00 16.60 190.53 64.72 27.27 62.24 1224.58 78.11

1000 rows × 9 columns


In [119]:
s1 = close_px['AAPL'][-20:]      # the last 20 rows
s2 = close_px['AAPL'][-25:-10]   # the last 25 - last 10 = 15 rows.
side_by_side(s1, s2)


2011-09-19    411.63          2011-09-12    379.94      
2011-09-20    413.45          2011-09-13    384.62      
2011-09-21    412.14          2011-09-14    389.30      
2011-09-22    401.82          2011-09-15    392.96      
2011-09-23    404.30          2011-09-16    400.50      
2011-09-26    403.17          2011-09-19    411.63      
2011-09-27    399.26          2011-09-20    413.45      
2011-09-28    397.01          2011-09-21    412.14      
2011-09-29    390.57          2011-09-22    401.82      
2011-09-30    381.32          2011-09-23    404.30      
2011-10-03    374.60          2011-09-26    403.17      
2011-10-04    372.50          2011-09-27    399.26      
2011-10-05    378.25          2011-09-28    397.01      
2011-10-06    377.37          2011-09-29    390.57      
2011-10-07    369.80          2011-09-30    381.32      
2011-10-10    388.81          Name: AAPL, dtype: float64
2011-10-11    400.29                                    
2011-10-12    402.19                                    
2011-10-13    408.43                                    
2011-10-14    422.00                                    
Name: AAPL, dtype: float64                              

In [120]:
s1 + s2


Out[120]:
2011-09-12       NaN
2011-09-13       NaN
2011-09-14       NaN
2011-09-15       NaN
2011-09-16       NaN
2011-09-19    823.26
2011-09-20    826.90
2011-09-21    824.28
2011-09-22    803.64
2011-09-23    808.60
2011-09-26    806.34
2011-09-27    798.52
2011-09-28    794.02
2011-09-29    781.14
2011-09-30    762.64
2011-10-03       NaN
2011-10-04       NaN
2011-10-05       NaN
2011-10-06       NaN
2011-10-07       NaN
2011-10-10       NaN
2011-10-11       NaN
2011-10-12       NaN
2011-10-13       NaN
2011-10-14       NaN
Name: AAPL, dtype: float64

In [121]:
s1.add(s2, fill_value=0)


Out[121]:
2011-09-12    379.94
2011-09-13    384.62
2011-09-14    389.30
2011-09-15    392.96
2011-09-16    400.50
2011-09-19    823.26
2011-09-20    826.90
2011-09-21    824.28
2011-09-22    803.64
2011-09-23    808.60
2011-09-26    806.34
2011-09-27    798.52
2011-09-28    794.02
2011-09-29    781.14
2011-09-30    762.64
2011-10-03    374.60
2011-10-04    372.50
2011-10-05    378.25
2011-10-06    377.37
2011-10-07    369.80
2011-10-10    388.81
2011-10-11    400.29
2011-10-12    402.19
2011-10-13    408.43
2011-10-14    422.00
Name: AAPL, dtype: float64

In [122]:
(s1 + s2).dropna()


Out[122]:
2011-09-19    823.26
2011-09-20    826.90
2011-09-21    824.28
2011-09-22    803.64
2011-09-23    808.60
2011-09-26    806.34
2011-09-27    798.52
2011-09-28    794.02
2011-09-29    781.14
2011-09-30    762.64
Name: AAPL, dtype: float64

In [123]:
df = close_px.ix[-10:, :3]
df


Out[123]:
AA AAPL GE
2011-10-03 8.90 374.60 14.69
2011-10-04 9.12 372.50 14.86
2011-10-05 9.37 378.25 15.27
2011-10-06 9.88 377.37 15.53
2011-10-07 9.71 369.80 15.50
2011-10-10 10.09 388.81 16.14
2011-10-11 10.30 400.29 16.14
2011-10-12 10.05 402.19 16.40
2011-10-13 10.10 408.43 16.22
2011-10-14 10.26 422.00 16.60

In [124]:
side_by_side(s1.reindex(s2.index), s2)   # align indexes of s1 and s2. Print 2 objects alongside each other.


2011-09-12       NaN          2011-09-12    379.94      
2011-09-13       NaN          2011-09-13    384.62      
2011-09-14       NaN          2011-09-14    389.30      
2011-09-15       NaN          2011-09-15    392.96      
2011-09-16       NaN          2011-09-16    400.50      
2011-09-19    411.63          2011-09-19    411.63      
2011-09-20    413.45          2011-09-20    413.45      
2011-09-21    412.14          2011-09-21    412.14      
2011-09-22    401.82          2011-09-22    401.82      
2011-09-23    404.30          2011-09-23    404.30      
2011-09-26    403.17          2011-09-26    403.17      
2011-09-27    399.26          2011-09-27    399.26      
2011-09-28    397.01          2011-09-28    397.01      
2011-09-29    390.57          2011-09-29    390.57      
2011-09-30    381.32          2011-09-30    381.32      
Name: AAPL, dtype: float64    Name: AAPL, dtype: float64

In [125]:
side_by_side(s1.ix[s2.index], s2)   # same as above, supported in newer version of 0.12 ipython.


2011-09-12       NaN          2011-09-12    379.94      
2011-09-13       NaN          2011-09-13    384.62      
2011-09-14       NaN          2011-09-14    389.30      
2011-09-15       NaN          2011-09-15    392.96      
2011-09-16       NaN          2011-09-16    400.50      
2011-09-19    411.63          2011-09-19    411.63      
2011-09-20    413.45          2011-09-20    413.45      
2011-09-21    412.14          2011-09-21    412.14      
2011-09-22    401.82          2011-09-22    401.82      
2011-09-23    404.30          2011-09-23    404.30      
2011-09-26    403.17          2011-09-26    403.17      
2011-09-27    399.26          2011-09-27    399.26      
2011-09-28    397.01          2011-09-28    397.01      
2011-09-29    390.57          2011-09-29    390.57      
2011-09-30    381.32          2011-09-30    381.32      
Name: AAPL, dtype: float64    Name: AAPL, dtype: float64

In [126]:
b, c  = s1.align(s2, join='inner')
side_by_side(b, c)


2011-09-19    411.63          2011-09-19    411.63      
2011-09-20    413.45          2011-09-20    413.45      
2011-09-21    412.14          2011-09-21    412.14      
2011-09-22    401.82          2011-09-22    401.82      
2011-09-23    404.30          2011-09-23    404.30      
2011-09-26    403.17          2011-09-26    403.17      
2011-09-27    399.26          2011-09-27    399.26      
2011-09-28    397.01          2011-09-28    397.01      
2011-09-29    390.57          2011-09-29    390.57      
2011-09-30    381.32          2011-09-30    381.32      
Name: AAPL, dtype: float64    Name: AAPL, dtype: float64

In [127]:
b, c  = s1.align(s2, join='outer')
side_by_side(b, c)


2011-09-12       NaN          2011-09-12    379.94      
2011-09-13       NaN          2011-09-13    384.62      
2011-09-14       NaN          2011-09-14    389.30      
2011-09-15       NaN          2011-09-15    392.96      
2011-09-16       NaN          2011-09-16    400.50      
2011-09-19    411.63          2011-09-19    411.63      
2011-09-20    413.45          2011-09-20    413.45      
2011-09-21    412.14          2011-09-21    412.14      
2011-09-22    401.82          2011-09-22    401.82      
2011-09-23    404.30          2011-09-23    404.30      
2011-09-26    403.17          2011-09-26    403.17      
2011-09-27    399.26          2011-09-27    399.26      
2011-09-28    397.01          2011-09-28    397.01      
2011-09-29    390.57          2011-09-29    390.57      
2011-09-30    381.32          2011-09-30    381.32      
2011-10-03    374.60          2011-10-03       NaN      
2011-10-04    372.50          2011-10-04       NaN      
2011-10-05    378.25          2011-10-05       NaN      
2011-10-06    377.37          2011-10-06       NaN      
2011-10-07    369.80          2011-10-07       NaN      
2011-10-10    388.81          2011-10-10       NaN      
2011-10-11    400.29          2011-10-11       NaN      
2011-10-12    402.19          2011-10-12       NaN      
2011-10-13    408.43          2011-10-13       NaN      
2011-10-14    422.00          2011-10-14       NaN      
Name: AAPL, dtype: float64    Name: AAPL, dtype: float64

b, c = s1.align(s2, join='right') side_by_side(b, c)


In [128]:
df = close_px.ix[-10:, ['AAPL', 'IBM', 'MSFT']]
df


Out[128]:
AAPL IBM MSFT
2011-10-03 374.60 173.29 24.53
2011-10-04 372.50 174.74 25.34
2011-10-05 378.25 176.85 25.89
2011-10-06 377.37 181.69 26.34
2011-10-07 369.80 182.39 26.25
2011-10-10 388.81 186.62 26.94
2011-10-11 400.29 185.00 27.00
2011-10-12 402.19 186.12 26.96
2011-10-13 408.43 186.82 27.18
2011-10-14 422.00 190.53 27.27

In [129]:
df2 = df.ix[::2, ['IBM', 'MSFT']]
side_by_side(df, df2)


              AAPL     IBM   MSFT                   IBM   MSFT
2011-10-03  374.60  173.29  24.53    2011-10-03  173.29  24.53
2011-10-04  372.50  174.74  25.34    2011-10-05  176.85  25.89
2011-10-05  378.25  176.85  25.89    2011-10-07  182.39  26.25
2011-10-06  377.37  181.69  26.34    2011-10-11  185.00  27.00
2011-10-07  369.80  182.39  26.25    2011-10-13  186.82  27.18
2011-10-10  388.81  186.62  26.94                             
2011-10-11  400.29  185.00  27.00                             
2011-10-12  402.19  186.12  26.96                             
2011-10-13  408.43  186.82  27.18                             
2011-10-14  422.00  190.53  27.27                             

In [130]:
df + df2


Out[130]:
AAPL IBM MSFT
2011-10-03 NaN 346.58 49.06
2011-10-04 NaN NaN NaN
2011-10-05 NaN 353.70 51.78
2011-10-06 NaN NaN NaN
2011-10-07 NaN 364.78 52.50
2011-10-10 NaN NaN NaN
2011-10-11 NaN 370.00 54.00
2011-10-12 NaN NaN NaN
2011-10-13 NaN 373.64 54.36
2011-10-14 NaN NaN NaN

In [131]:
b, c = df.align(df2, join='inner')
side_by_side(b, c)


               IBM   MSFT                   IBM   MSFT
2011-10-03  173.29  24.53    2011-10-03  173.29  24.53
2011-10-05  176.85  25.89    2011-10-05  176.85  25.89
2011-10-07  182.39  26.25    2011-10-07  182.39  26.25
2011-10-11  185.00  27.00    2011-10-11  185.00  27.00
2011-10-13  186.82  27.18    2011-10-13  186.82  27.18

Transposing: no copy if all columns are same type


In [132]:
df[:5].T


Out[132]:
2011-10-03 00:00:00 2011-10-04 00:00:00 2011-10-05 00:00:00 2011-10-06 00:00:00 2011-10-07 00:00:00
AAPL 374.60 372.50 378.25 377.37 369.80
IBM 173.29 174.74 176.85 181.69 182.39
MSFT 24.53 25.34 25.89 26.34 26.25

Columns can be any type


In [133]:
n = 10
foo = DataFrame(index=range(n))
foo['floats'] = np.random.randn(n)
foo['ints'] = np.arange(n)
foo['strings'] = ['foo', 'bar'] * (n / 2)
foo['bools'] = foo['floats'] > 0
foo['objects'] = pandas.date_range('1/1/2000', periods=n)
foo


Out[133]:
floats ints strings bools objects
0 2.529748 0 foo True 2000-01-01
1 -0.975089 1 bar False 2000-01-02
2 -1.093029 2 foo False 2000-01-03
3 -1.043257 3 bar False 2000-01-04
4 1.203834 4 foo True 2000-01-05
5 -0.974700 5 bar False 2000-01-06
6 0.427725 6 foo True 2000-01-07
7 0.995705 7 bar True 2000-01-08
8 -0.550763 8 foo False 2000-01-09
9 -1.495519 9 bar False 2000-01-10

In [134]:
foo.dtypes


Out[134]:
floats            float64
ints                int32
strings            object
bools                bool
objects    datetime64[ns]
dtype: object

N.B. transposing is not roundtrippable in this case (column-oriented data structure)


In [135]:
foo.T.T


Out[135]:
floats ints strings bools objects
0 2.52975 0 foo True 2000-01-01 00:00:00
1 -0.975089 1 bar False 2000-01-02 00:00:00
2 -1.09303 2 foo False 2000-01-03 00:00:00
3 -1.04326 3 bar False 2000-01-04 00:00:00
4 1.20383 4 foo True 2000-01-05 00:00:00
5 -0.9747 5 bar False 2000-01-06 00:00:00
6 0.427725 6 foo True 2000-01-07 00:00:00
7 0.995705 7 bar True 2000-01-08 00:00:00
8 -0.550763 8 foo False 2000-01-09 00:00:00
9 -1.49552 9 bar False 2000-01-10 00:00:00

In [136]:
foo.T.T.dtypes


Out[136]:
floats     object
ints       object
strings    object
bools      object
objects    object
dtype: object

Function application

You can apply arbitrary functions to the rows or columns of a DataFrame


In [137]:
df


Out[137]:
AAPL IBM MSFT
2011-10-03 374.60 173.29 24.53
2011-10-04 372.50 174.74 25.34
2011-10-05 378.25 176.85 25.89
2011-10-06 377.37 181.69 26.34
2011-10-07 369.80 182.39 26.25
2011-10-10 388.81 186.62 26.94
2011-10-11 400.29 185.00 27.00
2011-10-12 402.19 186.12 26.96
2011-10-13 408.43 186.82 27.18
2011-10-14 422.00 190.53 27.27

In [138]:
df.apply(np.mean)


Out[138]:
AAPL    389.424
IBM     182.405
MSFT     26.370
dtype: float64

In [139]:
df.mean()   # same as above.   df.mean? to look up the function usage.


Out[139]:
AAPL    389.424
IBM     182.405
MSFT     26.370
dtype: float64

In [140]:
df.mean(1)   # get the mean of the rows based on a column. Also excludes missing data (NaN: not a number).


Out[140]:
2011-10-03    190.806667
2011-10-04    190.860000
2011-10-05    193.663333
2011-10-06    195.133333
2011-10-07    192.813333
2011-10-10    200.790000
2011-10-11    204.096667
2011-10-12    205.090000
2011-10-13    207.476667
2011-10-14    213.266667
dtype: float64

In [141]:
df.mean(1, skipna=False)   # flag skipna: to include NaN. In this data, no NaN.


Out[141]:
2011-10-03    190.806667
2011-10-04    190.860000
2011-10-05    193.663333
2011-10-06    195.133333
2011-10-07    192.813333
2011-10-10    200.790000
2011-10-11    204.096667
2011-10-12    205.090000
2011-10-13    207.476667
2011-10-14    213.266667
dtype: float64

In [142]:
df.apply(np.mean, axis=1)   # same as above.


Out[142]:
2011-10-03    190.806667
2011-10-04    190.860000
2011-10-05    193.663333
2011-10-06    195.133333
2011-10-07    192.813333
2011-10-10    200.790000
2011-10-11    204.096667
2011-10-12    205.090000
2011-10-13    207.476667
2011-10-14    213.266667
dtype: float64

You can get as fancy as you want


In [143]:
close_px    # Output: 1000 rows.


Out[143]:
AA AAPL GE IBM JNJ MSFT PEP SPX XOM
2007-10-29 37.41 185.09 34.46 106.78 57.13 31.78 65.67 1540.98 85.51
2007-10-30 36.43 187.00 34.39 106.15 56.99 32.70 65.80 1531.02 83.25
2007-10-31 36.79 189.95 34.97 108.01 57.30 33.84 65.69 1549.38 84.03
2007-11-01 35.22 187.44 34.27 105.72 56.85 34.07 64.51 1508.44 80.84
2007-11-02 35.83 187.87 34.27 106.59 56.95 34.07 65.03 1509.65 80.32
2007-11-05 35.22 186.18 34.15 105.48 56.70 33.77 64.84 1502.17 80.07
2007-11-06 35.84 191.79 34.14 105.27 56.80 33.47 65.49 1520.27 82.56
2007-11-07 34.76 186.30 33.20 103.69 56.19 32.65 64.46 1475.62 79.96
2007-11-08 35.00 175.47 33.15 99.05 56.79 31.94 65.50 1474.77 82.00
2007-11-09 34.47 165.37 32.61 93.58 57.29 31.01 65.48 1453.70 79.64
2007-11-12 33.33 153.76 32.50 94.70 58.30 30.69 64.67 1439.18 77.52
2007-11-13 34.04 169.96 33.31 98.27 59.18 31.79 65.35 1481.05 79.67
2007-11-14 34.25 166.11 33.14 96.56 58.81 31.30 65.99 1470.58 79.15
2007-11-15 33.76 164.30 32.55 96.71 58.80 31.14 65.57 1451.15 77.48
2007-11-16 33.74 166.39 32.84 97.82 59.56 31.44 66.08 1458.74 78.04
2007-11-19 32.59 163.95 32.42 95.42 59.42 31.32 66.90 1433.27 77.13
2007-11-20 32.88 168.85 32.32 96.54 59.47 31.90 66.69 1439.70 80.53
2007-11-21 32.68 168.46 31.58 95.42 59.03 31.57 66.87 1416.77 79.82
2007-11-23 32.66 171.54 32.01 97.13 59.16 31.46 67.29 1440.70 80.96
2007-11-26 32.41 172.54 31.21 95.19 59.60 30.41 66.56 1407.22 78.57
2007-11-27 32.56 174.81 31.82 96.92 59.92 30.49 67.65 1428.23 79.21
2007-11-28 33.65 180.22 32.68 100.23 60.46 31.08 68.76 1469.02 80.62
2007-11-29 33.97 184.29 32.40 100.35 60.51 30.98 68.20 1469.72 81.24
2007-11-30 33.80 182.22 32.53 98.18 59.93 30.99 68.78 1481.14 81.76
2007-12-03 32.98 178.86 31.38 98.79 59.90 30.36 68.24 1472.42 81.48
2007-12-04 32.76 179.81 30.80 99.54 60.10 30.23 68.30 1462.79 80.81
2007-12-05 33.09 185.50 31.19 100.97 60.34 31.50 68.30 1485.01 82.46
2007-12-06 33.32 189.95 31.66 102.40 60.42 31.87 68.98 1507.34 83.85
2007-12-07 34.30 194.30 31.63 101.62 59.87 31.85 68.95 1504.66 83.91
2007-12-10 35.17 194.21 31.78 102.11 59.97 32.06 69.20 1515.96 84.39
... ... ... ... ... ... ... ... ... ...
2011-09-02 12.04 374.05 15.61 166.98 64.07 25.80 63.30 1173.97 72.14
2011-09-06 11.77 379.74 15.11 165.11 64.64 25.51 62.45 1165.24 71.15
2011-09-07 12.25 383.93 15.65 167.31 65.43 26.00 61.62 1198.62 73.65
2011-09-08 12.03 384.14 15.44 165.25 64.95 26.22 61.34 1185.90 72.82
2011-09-09 11.58 377.48 14.95 161.37 63.64 25.74 59.99 1154.23 71.01
2011-09-12 11.55 379.94 14.87 162.42 63.59 25.89 60.14 1162.27 71.84
2011-09-13 11.63 384.62 15.26 163.43 63.61 26.04 60.54 1172.87 71.65
2011-09-14 11.73 389.30 15.64 167.24 63.73 26.50 61.58 1188.68 72.64
2011-09-15 11.98 392.96 16.08 170.09 64.40 26.99 63.22 1209.11 74.01
2011-09-16 11.97 400.50 16.33 172.99 64.59 27.12 62.05 1216.01 74.55
2011-09-19 11.58 411.63 16.18 173.13 64.14 27.21 60.56 1204.09 73.70
2011-09-20 11.25 413.45 16.04 174.72 64.22 26.98 60.39 1202.09 74.01
2011-09-21 10.84 412.14 15.38 173.02 63.13 25.99 60.79 1166.76 71.97
2011-09-22 10.11 401.82 15.04 168.62 61.92 25.06 60.92 1129.56 69.24
2011-09-23 10.07 404.30 15.21 169.34 61.59 25.06 60.34 1136.43 69.31
2011-09-26 10.45 403.17 15.57 174.51 62.69 25.44 61.89 1162.95 71.72
2011-09-27 10.48 399.26 15.76 177.71 63.82 25.67 62.43 1175.38 72.91
2011-09-28 9.97 397.01 15.45 177.55 63.25 25.58 61.97 1151.06 72.07
2011-09-29 10.06 390.57 15.86 179.17 63.90 25.45 62.58 1160.40 73.88
2011-09-30 9.57 381.32 15.22 174.87 63.69 24.89 61.90 1131.42 72.63
2011-10-03 8.90 374.60 14.69 173.29 62.08 24.53 60.29 1099.23 71.15
2011-10-04 9.12 372.50 14.86 174.74 62.17 25.34 60.45 1123.95 72.83
2011-10-05 9.37 378.25 15.27 176.85 62.35 25.89 60.29 1144.03 73.95
2011-10-06 9.88 377.37 15.53 181.69 62.81 26.34 60.57 1164.97 73.89
2011-10-07 9.71 369.80 15.50 182.39 63.13 26.25 61.02 1155.46 73.56
2011-10-10 10.09 388.81 16.14 186.62 64.43 26.94 61.87 1194.89 76.28
2011-10-11 10.30 400.29 16.14 185.00 63.96 27.00 60.95 1195.54 76.27
2011-10-12 10.05 402.19 16.40 186.12 64.33 26.96 62.70 1207.25 77.16
2011-10-13 10.10 408.43 16.22 186.82 64.23 27.18 62.36 1203.66 76.37
2011-10-14 10.26 422.00 16.60 190.53 64.72 27.27 62.24 1224.58 78.11

1000 rows × 9 columns


In [144]:
close_px.AAPL   # For easier typing, after dot ., use tab to select the name of the stock.


Out[144]:
2007-10-29    185.09
2007-10-30    187.00
2007-10-31    189.95
2007-11-01    187.44
2007-11-02    187.87
2007-11-05    186.18
2007-11-06    191.79
2007-11-07    186.30
2007-11-08    175.47
2007-11-09    165.37
2007-11-12    153.76
2007-11-13    169.96
2007-11-14    166.11
2007-11-15    164.30
2007-11-16    166.39
2007-11-19    163.95
2007-11-20    168.85
2007-11-21    168.46
2007-11-23    171.54
2007-11-26    172.54
2007-11-27    174.81
2007-11-28    180.22
2007-11-29    184.29
2007-11-30    182.22
2007-12-03    178.86
2007-12-04    179.81
2007-12-05    185.50
2007-12-06    189.95
2007-12-07    194.30
2007-12-10    194.21
               ...  
2011-09-02    374.05
2011-09-06    379.74
2011-09-07    383.93
2011-09-08    384.14
2011-09-09    377.48
2011-09-12    379.94
2011-09-13    384.62
2011-09-14    389.30
2011-09-15    392.96
2011-09-16    400.50
2011-09-19    411.63
2011-09-20    413.45
2011-09-21    412.14
2011-09-22    401.82
2011-09-23    404.30
2011-09-26    403.17
2011-09-27    399.26
2011-09-28    397.01
2011-09-29    390.57
2011-09-30    381.32
2011-10-03    374.60
2011-10-04    372.50
2011-10-05    378.25
2011-10-06    377.37
2011-10-07    369.80
2011-10-10    388.81
2011-10-11    400.29
2011-10-12    402.19
2011-10-13    408.43
2011-10-14    422.00
Name: AAPL, dtype: float64

In [145]:
close_px.AAPL.idxmax() # gives index of its maximum value.


Out[145]:
Timestamp('2011-10-14 00:00:00')

In [146]:
close_px.AAPL.index[1]


Out[146]:
Timestamp('2007-10-30 00:00:00')

In [180]:
close_px.AAPL.index[close_px.AAPL.max()]  # If you are numpy aficionado, does the same thing.


Out[180]:
Timestamp('2009-07-02 00:00:00')

In [148]:
close_px.AA.idxmax()

def peak_date(series):
    return series.idxmax()

In [149]:
close_px.apply(peak_date)


Out[149]:
AA     2008-05-19
AAPL   2011-10-14
GE     2007-10-31
IBM    2011-10-14
JNJ    2011-07-07
MSFT   2007-11-01
PEP    2008-01-10
SPX    2007-10-31
XOM    2008-05-20
dtype: datetime64[ns]

In [150]:
for column in close_px:
    print close_px[column].max()


41.86
422.0
34.97
190.53
67.32
34.07
71.25
1549.38
87.48

In [151]:
def peak_date(series):
    return series.idxmax()    # new method: series.idxmax()
close_px.apply(peak_date, axis=0)


Out[151]:
AA     2008-05-19
AAPL   2011-10-14
GE     2007-10-31
IBM    2011-10-14
JNJ    2011-07-07
MSFT   2007-11-01
PEP    2008-01-10
SPX    2007-10-31
XOM    2008-05-20
dtype: datetime64[ns]

In [152]:
df.apply(lambda x: x.max())


Out[152]:
AAPL    422.00
IBM     190.53
MSFT     27.27
dtype: float64

In [153]:
df.apply(lambda x: x.max() - x.min()) # np.ptp


Out[153]:
AAPL    52.20
IBM     17.24
MSFT     2.74
dtype: float64

In [154]:
np.log(close_px)


Out[154]:
AA AAPL GE IBM JNJ MSFT PEP SPX XOM
2007-10-29 3.621938 5.220842 3.539799 4.670771 4.045329 3.458837 4.184642 7.340174 4.448633
2007-10-30 3.595393 5.231109 3.537766 4.664853 4.042876 3.487375 4.186620 7.333689 4.421848
2007-10-31 3.605226 5.246761 3.554491 4.682224 4.048301 3.521644 4.184947 7.345610 4.431174
2007-11-01 3.561614 5.233459 3.534270 4.660794 4.040416 3.528417 4.166820 7.318831 4.392472
2007-11-02 3.578786 5.235750 3.534270 4.668990 4.042174 3.528417 4.174849 7.319633 4.386019
2007-11-05 3.561614 5.226714 3.530763 4.658521 4.037774 3.519573 4.171923 7.314666 4.382901
2007-11-06 3.579065 5.256401 3.530470 4.656528 4.039536 3.510650 4.181897 7.326643 4.413525
2007-11-07 3.548467 5.227358 3.502550 4.641406 4.028739 3.485845 4.166045 7.296834 4.381527
2007-11-08 3.555348 5.167468 3.501043 4.595625 4.039360 3.463859 4.182050 7.296257 4.406719
2007-11-09 3.540089 5.108185 3.484619 4.538817 4.048126 3.434310 4.181745 7.281867 4.377516
2007-11-12 3.506458 5.035393 3.481240 4.550714 4.065602 3.423937 4.169297 7.271829 4.350536
2007-11-13 3.527536 5.135563 3.505858 4.587719 4.080584 3.459152 4.179757 7.300507 4.377893
2007-11-14 3.533687 5.112650 3.500741 4.570165 4.074312 3.443618 4.189503 7.293412 4.371345
2007-11-15 3.519277 5.101694 3.482777 4.571717 4.074142 3.438493 4.183118 7.280112 4.350020
2007-11-16 3.518684 5.114334 3.491647 4.583129 4.086984 3.448081 4.190866 7.285328 4.357222
2007-11-19 3.484005 5.099562 3.478776 4.558288 4.084631 3.444257 4.203199 7.267714 4.345492
2007-11-20 3.492865 5.129011 3.475686 4.569957 4.085472 3.462606 4.200055 7.272190 4.388630
2007-11-21 3.486763 5.126698 3.452524 4.558288 4.078046 3.452207 4.202750 7.256135 4.379774
2007-11-23 3.486151 5.144816 3.466048 4.576050 4.080246 3.448717 4.209012 7.272884 4.393955
2007-11-26 3.478467 5.150629 3.440739 4.555875 4.087656 3.414772 4.198104 7.249371 4.363990
2007-11-27 3.483085 5.163700 3.460095 4.573886 4.093010 3.417399 4.214347 7.264191 4.372103
2007-11-28 3.516013 5.194178 3.486763 4.607468 4.101982 3.436565 4.230622 7.292351 4.389747
2007-11-29 3.525478 5.216511 3.478158 4.608664 4.102809 3.433342 4.222445 7.292827 4.397408
2007-11-30 3.520461 5.205215 3.482163 4.586803 4.093177 3.433665 4.230913 7.300567 4.403788
2007-12-03 3.495901 5.186603 3.446171 4.592996 4.092677 3.413126 4.223031 7.294663 4.400358
2007-12-04 3.489208 5.191901 3.427515 4.600560 4.096010 3.408835 4.223910 7.288101 4.392101
2007-12-05 3.499231 5.223055 3.440098 4.614823 4.099995 3.449988 4.223910 7.303177 4.412313
2007-12-06 3.506158 5.246761 3.455054 4.628887 4.101320 3.461665 4.233817 7.318102 4.429029
2007-12-07 3.535145 5.269403 3.454106 4.621240 4.092176 3.461037 4.233382 7.316322 4.429745
2007-12-10 3.560193 5.268940 3.458837 4.626051 4.093844 3.467609 4.237001 7.323804 4.435449
... ... ... ... ... ... ... ... ... ...
2011-09-02 2.488234 5.924389 2.747912 5.117874 4.159976 3.250374 4.147885 7.068146 4.278609
2011-09-06 2.465554 5.939487 2.715357 5.106612 4.168833 3.239071 4.134366 7.060682 4.264790
2011-09-07 2.505526 5.950460 2.750471 5.119848 4.180981 3.258097 4.120986 7.088926 4.299324
2011-09-08 2.487404 5.951007 2.736962 5.107459 4.173618 3.266522 4.116432 7.078257 4.287991
2011-09-09 2.449279 5.933518 2.704711 5.083700 4.153242 3.248046 4.094178 7.051189 4.262821
2011-09-12 2.446685 5.940013 2.699346 5.090186 4.152456 3.253857 4.096675 7.058130 4.274441
2011-09-13 2.453588 5.952256 2.725235 5.096385 4.152771 3.259634 4.103304 7.067209 4.271793
2011-09-14 2.462150 5.964350 2.749832 5.119430 4.154655 3.277145 4.120337 7.080599 4.285516
2011-09-15 2.483239 5.973708 2.777576 5.136328 4.165114 3.295466 4.146621 7.097640 4.304200
2011-09-16 2.482404 5.992714 2.793004 5.153234 4.168060 3.300271 4.127941 7.103330 4.311470
2011-09-19 2.449279 6.020125 2.783776 5.154043 4.161068 3.303585 4.103635 7.093479 4.300003
2011-09-20 2.420368 6.024537 2.775086 5.163185 4.162315 3.295096 4.100824 7.091817 4.304200
2011-09-21 2.383243 6.021363 2.733068 5.153407 4.145196 3.257712 4.107425 7.061986 4.276249
2011-09-22 2.313525 5.996004 2.710713 5.127648 4.125843 3.221273 4.109562 7.029583 4.237579
2011-09-23 2.309561 6.002157 2.721953 5.131909 4.120500 3.221273 4.099995 7.035647 4.238589
2011-09-26 2.346602 5.999358 2.745346 5.161982 4.138202 3.236323 4.125359 7.058715 4.272770
2011-09-27 2.349469 5.989613 2.757475 5.180153 4.156067 3.245323 4.134046 7.069347 4.289226
2011-09-28 2.299581 5.983961 2.737609 5.179252 4.147095 3.241811 4.126650 7.048439 4.277638
2011-09-29 2.308567 5.967607 2.763800 5.188335 4.157319 3.236716 4.136446 7.056520 4.302442
2011-09-30 2.258633 5.943639 2.722610 5.164043 4.154028 3.214466 4.125520 7.031229 4.285378
2011-10-03 2.186051 5.925859 2.687167 5.154966 4.128424 3.199897 4.099166 7.002365 4.264790
2011-10-04 2.210470 5.920237 2.698673 5.163299 4.129873 3.232384 4.101817 7.024605 4.288128
2011-10-05 2.237513 5.935555 2.725890 5.175302 4.132764 3.253857 4.099166 7.042312 4.303389
2011-10-06 2.290513 5.933226 2.742774 5.202302 4.140114 3.271089 4.103800 7.060451 4.302578
2011-10-07 2.273156 5.912962 2.740840 5.206147 4.145196 3.267666 4.111202 7.052254 4.298101
2011-10-10 2.311545 5.963091 2.781301 5.229074 4.165579 3.293612 4.125035 7.085809 4.334411
2011-10-11 2.332144 5.992189 2.781301 5.220356 4.158258 3.295837 4.110054 7.086353 4.334280
2011-10-12 2.307573 5.996925 2.797281 5.226392 4.164026 3.294354 4.138361 7.096100 4.345881
2011-10-13 2.312535 6.012321 2.786245 5.230146 4.162470 3.302481 4.132924 7.093122 4.335590
2011-10-14 2.328253 6.045005 2.809403 5.249810 4.170070 3.305787 4.130998 7.110353 4.358118

1000 rows × 9 columns

Hierarchical indexing


In [155]:
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]])
hdf = DataFrame(np.random.randn(10, 3), index=index,
                columns=['A', 'B', 'C'])
hdf


Out[155]:
A B C
foo one 0.623840 -2.410687 -0.801873
two 0.508569 0.768385 -0.567528
three -0.237210 -0.184590 2.709648
bar one 0.246020 0.636503 1.029029
two -0.406674 -0.057052 -1.458452
baz two -0.921052 0.874554 -1.078501
three -0.282513 0.773144 -0.665951
qux one -0.080384 -0.563710 0.251808
two -1.333727 -0.078392 0.529107
three -0.517454 1.104363 -1.954480

In [156]:
hdf.ix['foo']


Out[156]:
A B C
one 0.623840 -2.410687 -0.801873
two 0.508569 0.768385 -0.567528
three -0.237210 -0.184590 2.709648

In [157]:
hdf.ix['foo'] = 0
hdf


Out[157]:
A B C
foo one 0.000000 0.000000 0.000000
two 0.000000 0.000000 0.000000
three 0.000000 0.000000 0.000000
bar one 0.246020 0.636503 1.029029
two -0.406674 -0.057052 -1.458452
baz two -0.921052 0.874554 -1.078501
three -0.282513 0.773144 -0.665951
qux one -0.080384 -0.563710 0.251808
two -1.333727 -0.078392 0.529107
three -0.517454 1.104363 -1.954480

In [158]:
hdf.ix['bar','one']['A']


Out[158]:
0.24601987202663883

Stacking and unstacking


In [159]:
tuples = zip(*[['bar', 'bar', 'baz', 'baz',
                'foo', 'foo', 'qux', 'qux'],
               ['one', 'two', 'one', 'two',
                'one', 'two', 'one', 'two']])
index = MultiIndex.from_tuples(tuples)
columns = MultiIndex.from_tuples([('A', 'cat'), ('B', 'dog'),
                                  ('B', 'cat'), ('A', 'dog')])
df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)
df


Out[159]:
A B A
cat dog cat dog
bar one -2.631352 -0.513803 -1.037200 -1.468317
two 0.099119 -1.894265 -0.658288 -0.704874
baz one -0.072433 -0.843239 -0.907227 2.576575
two 0.743173 -0.151601 -0.561562 -0.343702
foo one -1.114722 -0.368340 -0.715684 0.608736
two 1.048899 -0.887805 -1.184470 0.858906
qux one -0.426258 0.870646 0.660788 -0.582021
two -0.071411 1.484308 0.086960 0.155612

In [160]:
df2 = df.ix[[0, 1, 2, 4, 5, 7]]
df2


Out[160]:
A B A
cat dog cat dog
bar one -2.631352 -0.513803 -1.037200 -1.468317
two 0.099119 -1.894265 -0.658288 -0.704874
baz one -0.072433 -0.843239 -0.907227 2.576575
foo one -1.114722 -0.368340 -0.715684 0.608736
two 1.048899 -0.887805 -1.184470 0.858906
qux two -0.071411 1.484308 0.086960 0.155612

In [161]:
df.unstack()['B']


Out[161]:
dog cat
one two one two
bar -0.513803 -1.894265 -1.037200 -0.658288
baz -0.843239 -0.151601 -0.907227 -0.561562
foo -0.368340 -0.887805 -0.715684 -1.184470
qux 0.870646 1.484308 0.660788 0.086960

GroupBy


In [162]:
df = DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                       'foo', 'bar', 'foo', 'foo'],
                'B' : ['one', 'one', 'two', 'three',
                       'two', 'two', 'one', 'three'],
                'C' : np.random.randn(8),
                'D' : np.random.randn(8)})
df


Out[162]:
A B C D
0 foo one -0.121817 -0.837150
1 bar one 0.078895 -1.937235
2 foo two -0.968802 0.194740
3 bar three -0.214375 -0.367901
4 foo two 0.573654 0.919633
5 bar two -0.087236 -0.830693
6 foo one -0.578651 -1.405809
7 foo three 0.928010 -0.037234

In [163]:
for key, group in df.groupby('A'):
    print key
    print group


bar
     A      B         C         D
1  bar    one  0.078895 -1.937235
3  bar  three -0.214375 -0.367901
5  bar    two -0.087236 -0.830693
foo
     A      B         C         D
0  foo    one -0.121817 -0.837150
2  foo    two -0.968802  0.194740
4  foo    two  0.573654  0.919633
6  foo    one -0.578651 -1.405809
7  foo  three  0.928010 -0.037234

In [164]:
df.groupby('A')['C'].describe().T


Out[164]:
A         
bar  count    3.000000
     mean    -0.074238
     std      0.147067
     min     -0.214375
     25%     -0.150805
     50%     -0.087236
     75%     -0.004170
     max      0.078895
foo  count    5.000000
     mean    -0.033521
     std      0.786275
     min     -0.968802
     25%     -0.578651
     50%     -0.121817
     75%      0.573654
     max      0.928010
Name: C, dtype: float64

In [165]:
df.groupby('A').mean()


Out[165]:
C D
A
bar -0.074238 -1.045276
foo -0.033521 -0.233164

In [166]:
for key, group in df.groupby('A'):
    print key
    print group


bar
     A      B         C         D
1  bar    one  0.078895 -1.937235
3  bar  three -0.214375 -0.367901
5  bar    two -0.087236 -0.830693
foo
     A      B         C         D
0  foo    one -0.121817 -0.837150
2  foo    two -0.968802  0.194740
4  foo    two  0.573654  0.919633
6  foo    one -0.578651 -1.405809
7  foo  three  0.928010 -0.037234

In [167]:
df.groupby(['A', 'B']).mean()


Out[167]:
C D
A B
bar one 0.078895 -1.937235
three -0.214375 -0.367901
two -0.087236 -0.830693
foo one -0.350234 -1.121479
three 0.928010 -0.037234
two -0.197574 0.557186

In [168]:
#df.groupby(['A', 'B'], as_index=False).mean()

In [169]:
df.stack()


Out[169]:
0  A          foo
   B          one
   C    -0.121817
   D     -0.83715
1  A          bar
   B          one
   C    0.0788954
   D     -1.93724
2  A          foo
   B          two
   C    -0.968802
   D      0.19474
3  A          bar
   B        three
   C    -0.214375
   D    -0.367901
4  A          foo
   B          two
   C     0.573654
   D     0.919633
5  A          bar
   B          two
   C   -0.0872356
   D    -0.830693
6  A          foo
   B          one
   C    -0.578651
   D     -1.40581
7  A          foo
   B        three
   C      0.92801
   D   -0.0372343
dtype: object

In [170]:
#df.stack().mean(1).unstack()

In [171]:
#df.groupby(level=1, axis=1).mean()

Crosstabs


In [185]:
movs = pd.read_csv('movie_metadata.csv')
movs.columns
movs.columns = [c.replace("_", " ") for c in movs.columns]
movs.columns


Out[185]:
Index([u'color', u'director_name', u'num_critic_for_reviews', u'duration',
       u'director_facebook_likes', u'actor_3_facebook_likes', u'actor_2_name',
       u'actor_1_facebook_likes', u'gross', u'genres', u'actor_1_name',
       u'movie_title', u'num_voted_users', u'cast_total_facebook_likes',
       u'actor_3_name', u'facenumber_in_poster', u'plot_keywords',
       u'movie_imdb_link', u'num_user_for_reviews', u'language', u'country',
       u'content_rating', u'budget', u'title_year', u'actor_2_facebook_likes',
       u'imdb_score', u'aspect_ratio', u'movie_facebook_likes'],
      dtype='object')

In [238]:
_movs = movs[movs.title_year.ge(2000) & movs.country.isin(['USA', 'UK'])]
_df = pd.crosstab(_movs.title_year, _movs.country)
_df
#_df = pd.crosstab(_movs.title_year, _movs.country, values=_movs.actor_1_facebook_likes, aggfunc=[sum, mean, median])
#_df
#pd.crosstab(_movs.title_year, _movs.country).apply(lambda r: (r/r.sum())*100, axis=1)


Out[238]:
country UK USA
title_year
2000.0 16 136
2001.0 12 149
2002.0 22 157
2003.0 6 135
2004.0 21 156
2005.0 23 154
2006.0 19 165
2007.0 19 151
2008.0 23 163
2009.0 21 184
2010.0 21 167
2011.0 20 164
2012.0 18 173
2013.0 16 171
2014.0 17 187
2015.0 17 171
2016.0 13 77

In [231]:
movs[['imdb_score']].hist()


Out[231]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0C834730>]], dtype=object)

In [241]:
### IPython Gyan!

%who DataFrame


b	 c	 close_px	 df	 df2	 foo	 group	 hdf	 movies	 
movs	 

In [240]:
%whos DataFrame


Variable   Type         Data/Info
---------------------------------
b          DataFrame                   IBM   MSFT<...>2011-10-13  186.82  27.18
c          DataFrame                   IBM   MSFT<...>2011-10-13  186.82  27.18
close_px   DataFrame                   AA    AAPL<...>\n[1000 rows x 9 columns]
df         DataFrame         A      B         C  <...>three  0.928010 -0.037234
df2        DataFrame                    A        <...>84308  0.086960  0.155612
foo        DataFrame         floats  ints strings<...>    bar  False 2000-01-10
group      DataFrame         A      B         C  <...>three  0.928010 -0.037234
hdf        DataFrame                      A      <...>17454  1.104363 -1.954480
movies     DataFrame                     color   <...>n[5043 rows x 28 columns]
movs       DataFrame                     color   <...>n[5043 rows x 28 columns]

In [239]:
%who_ls DataFrame


Out[239]:
['b', 'c', 'close_px', 'df', 'df2', 'foo', 'group', 'hdf', 'movies', 'movs']

In [247]:
sal = pd.read_csv(r'Salaries.csv')

In [311]:
sal['EmpFirstLetter'] = sal.EmployeeName.str[:1]
sal.pivot_table(index=['EmpFirstLetter'], columns=['Year'], values=['TotalPay'], aggfunc=[median, mean])


Out[311]:
median mean
TotalPay TotalPay
Year 2011 2012 2013 2014 2011 2012 2013 2014
EmpFirstLetter
A 65162.425 67560.735 71035.865 69012.000 67678.940063 69174.704277 73246.724899 72014.376458
B 71707.690 74225.175 78092.545 75498.910 74543.945176 77715.068752 79856.911802 77502.710600
C 66977.485 69727.000 72340.750 70165.095 68925.002466 71722.499686 74397.533451 73114.425919
D 73204.060 74934.190 80679.630 78325.550 75766.202913 78032.387262 82899.460636 80088.153392
E 65568.840 68598.200 72088.370 70594.135 69363.285091 73816.857244 76070.208850 74589.775289
F 67905.700 71177.370 76036.850 74918.880 71284.952568 74964.046448 79596.529293 78474.037458
G 71518.770 74093.000 77015.390 75986.035 76319.672653 78476.724127 80202.309589 78394.994790
H 67372.240 69284.275 71978.680 69302.230 70750.774931 72069.438369 76075.891995 72170.357107
I 64318.470 68073.530 66010.630 64360.530 63171.944882 67548.086261 67105.736514 65798.987690
J 71904.130 72582.000 76254.130 73772.100 73903.106667 76115.170751 79863.984656 77242.279871
K 68686.820 70836.070 74154.020 73102.010 72146.136918 74136.158000 77459.284115 75257.957574
L 65204.230 67525.510 70305.850 68737.520 67306.284486 69216.468480 71866.205683 69712.594843
M 68333.250 71064.070 75134.690 73444.590 73058.289696 75299.933923 79324.472191 77100.219620
N 63699.155 66590.050 72262.455 69507.155 67228.537056 69808.706896 74803.544698 73625.185565
O 67541.010 71394.600 75419.225 70534.830 68446.813004 72707.401382 75976.040134 72468.391195
P 71547.415 73227.155 78534.860 76236.525 75090.559506 77347.005092 81579.031957 80173.364359
Q 56238.595 59072.490 60534.870 62411.020 56023.475370 57311.629123 59008.182742 61424.621148
R 72288.165 73975.010 78300.900 76551.085 75740.907585 77697.462048 82172.628127 80036.289923
S 67600.200 69183.280 74097.200 70412.300 71007.312985 73348.068968 77644.063781 74255.508392
T 68660.785 71705.510 76263.600 73496.485 73567.004663 77501.463236 80617.462602 77044.068207
U 52307.120 67189.530 72930.840 72800.970 60302.572162 66102.203429 75601.140588 74737.448333
V 65691.940 68164.600 69452.150 66685.045 67790.889391 69236.273110 71174.767021 68694.508972
W 70921.910 72693.120 73627.450 74009.260 74572.959963 76607.886293 77750.351835 78739.229988
X 49571.435 51260.430 54684.325 55711.590 48824.644571 50073.505854 50378.596771 54246.543627
Y 58916.960 61637.340 62918.770 60854.055 60227.668223 63341.630736 66924.917970 64432.983333
Z 55655.085 56818.265 56719.100 55327.000 59245.056791 59806.612639 58471.202250 55603.639548

In [314]:
sal['PayMark'] = 'More than 100k'
sal.loc[sal.TotalPay.le(100000), 'PayMark'] = 'Less than 100k'
sal.PayMark.unique()


Out[314]:
array(['More than 100k', 'Less than 100k'], dtype=object)

In [316]:
sal.groupby('PayMark')['Id'].count()
#sal.PayMark.value_counts()


Out[316]:
Less than 100k    106687
More than 100k     41967
Name: PayMark, dtype: int64

In [2]:
import sys
sys.version


Out[2]:
'2.7.11 |Anaconda 2.5.0 (32-bit)| (default, Mar  4 2016, 15:18:41) [MSC v.1500 32 bit (Intel)]'

In [3]:
import pandas as pd
pd.__version__


Out[3]:
u'0.18.1'